// Sampler shared by every image read in this file: normalized coordinates,
// clamp-to-edge addressing, linear filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// Short aliases for the OpenCL float vector types.
#define vec2 float2
#define vec3 float3
#define vec4 float4
// Token aliases so that the component selectors .rgb / .rgba expand to
// .xyz / .xyzw. NOTE: being plain macros, they replace the tokens `rgb` and
// `rgba` wherever they appear, not only after a '.'.
#define rgb xyz
#define rgba xyzw
// Larger / smaller of two values.
// The whole expansion is parenthesized so the macros compose correctly inside
// bigger expressions (e.g. `_max(a, b) * 2`). Each argument may be evaluated
// twice, so do not pass expressions with side effects.
#define _max(a,b) (((a)>(b))?(a):(b))
#define _min(a,b) (((a)<(b))?(a):(b))
// Sample src_data at tc after remapping tc through the ROI stored in param:
// tc is multiplied component-wise by (origROI[2], origROI[3]) and then offset
// by (origROI[0], origROI[1]) before the read.
// NOTE(review): presumably origROI is (x, y, width, height) in normalized
// units -- confirm against the FilterParam definition.
vec4 INPUTSRC(image2d_t src_data,__global FilterParam* param, vec2 tc)
{
	const vec2 roi_scale  = (vec2)(param->origROI[2], param->origROI[3]);
	const vec2 roi_offset = (vec2)(param->origROI[0], param->origROI[1]);

	return read_imagef(src_data, sampler, tc * roi_scale + roi_offset);
}

// Sample src_data at tc with the file-level sampler; the coordinate is used
// as given, with no ROI remapping.
vec4 INPUT(image2d_t src_data, vec2 tc)
{
	const vec4 texel = read_imagef(src_data, sampler, tc);
	return texel;
}

// Blend the `merge` overlay onto `image` and write the result to `dest_data`,
// one work-item per output pixel.
//
// Per pixel:
//   1. The base image is sampled through INPUTSRC (ROI-remapped) and the
//      overlay through INPUT, both at the pixel-centre coordinate normalized
//      by the global work size.
//   2. The overlay is composited over the base using the overlay's w (alpha)
//      component; the result keeps the base pixel's w.
//   3. The composite is cross-faded with the untouched base pixel by
//      alpha / 100 (0 = base only, 100 = full composite).
//
// Work-items that fall outside dest_data do nothing, so the host may round the
// global size up. `alpha` values outside [0, 100] are clamped to that range.
__kernel void MAIN(__read_only image2d_t image,		// image buffer input
				   __read_only image2d_t merge,		// merge buffer input
				   	__write_only image2d_t dest_data,	// blended output
				   __global FilterParam* param,		// filter parameters (ROI used by INPUTSRC)
				   __private int alpha)	 				// blend factor, range [0, 100]
{
	// Global scale applied to the overlay's own alpha; fixed at full strength.
	const float merge_alpha = 1.0f;

	// Destination pixel for this work-item. Writing outside the image is
	// undefined, so skip work-items beyond the destination bounds.
	const int2 dst = (int2)((int)get_global_id(0), (int)get_global_id(1));
	const int2 dst_dim = get_image_dim(dest_data);
	if (dst.x >= dst_dim.x || dst.y >= dst_dim.y)
		return;

	// NOTE(review): get_global_id0/get_global_id1 are defined outside this
	// section; they presumably return the work-item position, possibly
	// adjusted via param -- confirm.
	int w = get_global_id0( param);
	int h = get_global_id1( param);

	int width = get_global_size(0);
	int height = get_global_size(1);

	// Pixel-centre coordinate normalized by the global work size.
	float2 uv = (float2)(((float)(w) + 0.5f) / width, ((float)(h) + 0.5f) / height);
	float4 originalColor = INPUTSRC(image, param, uv); // bgra
	float4 overlayColor = INPUT(merge, uv); // bgra

	// Composite the overlay over the base image, weighted by overlay alpha.
	float sa = overlayColor.w * merge_alpha;
	float da = 1.0f - sa;
	float4 fragColor = originalColor * da + overlayColor * sa;
	fragColor.w = originalColor.w;	// output keeps the base image's alpha

	// Cross-fade between the base image and the composite; the clamp keeps an
	// out-of-range `alpha` from extrapolating beyond either end point.
	float factor = clamp((float)(alpha) / 100.0f, 0.0f, 1.0f);
	fragColor = fragColor * factor + (1.0f - factor) * originalColor;

	write_imagef(dest_data, dst, fragColor);
}